{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pickle\n",
    "import operator\n",
    "import pandas as pd\n",
    "import jieba\n",
    "from language.langconv import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache E:\\Temp\\jieba.cache\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Hi.\\t嗨。', 'Hi.\\t你好。', 'Run.\\t你用跑的。', 'Wait!\\t等等！', 'Hello!\\t你好。', 'I try.\\t让我来。', 'I won!\\t我赢了。', 'Oh no!\\t不会吧。', 'Cheers!\\t乾杯!', 'He ran.\\t他跑了。']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading model cost 0.781 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "20403\n",
      "['Hop in .', 'I lost .', 'I quit .', \"I'm OK .\", 'Listen .', 'No way !', 'No way !', 'Really ?', 'Try it .', 'We try .']\n",
      "['跳进来 。', '我 迷失 了 。', '我 退出 。', '我 没事 。', '听 着 。', '不 可能 ！', '没门 ！', '你 确定 ？', '试试 吧 。', '我们 来 试试 。']\n",
      "['BOS 跳进来 。 EOS', 'BOS 我 迷失 了 。 EOS', 'BOS 我 退出 。 EOS', 'BOS 我 没事 。 EOS', 'BOS 听 着 。 EOS', 'BOS 不 可能 ！ EOS', 'BOS 没门 ！ EOS', 'BOS 你 确定 ？ EOS', 'BOS 试试 吧 。 EOS', 'BOS 我们 来 试试 。 EOS']\n",
      "['跳进来 。 EOS', '我 迷失 了 。 EOS', '我 退出 。 EOS', '我 没事 。 EOS', '听 着 。 EOS', '不 可能 ！ EOS', '没门 ！ EOS', '你 确定 ？ EOS', '试试 吧 。 EOS', '我们 来 试试 。 EOS']\n"
     ]
    }
   ],
   "source": [
    "def Traditional2Simplified(sentence):\n",
    "    \"\"\"Return `sentence` after conversion by `Converter('zh-hans')`.\n",
    "\n",
    "    `Converter` is presumably supplied by the `language.langconv` star import;\n",
    "    a fresh instance is constructed on every call.\n",
    "    \"\"\"\n",
    "    return Converter('zh-hans').convert(sentence)\n",
    "# Read the tab-separated sentence pairs, one pair per line.\n",
    "with open('cmn.txt', 'r', encoding='utf-8') as f:\n",
    "    lines = f.read().split('\\n')\n",
    "print(lines[:10])\n",
    "\n",
    "eng = []\n",
    "cns = []\n",
    "for line in lines:\n",
    "    fields = line.split('\\t')\n",
    "    # Skip blank or tab-less lines (e.g. the empty string left by a trailing\n",
    "    # newline); indexing fields[1] on them would raise IndexError.\n",
    "    if len(fields) < 2:\n",
    "        continue\n",
    "    english, chinese = fields[0], fields[1]\n",
    "    # Put a space before the last character so it becomes a separate token.\n",
    "    eng.append(english[:-1] + ' ' + english[-1:])\n",
    "    # Convert with Traditional2Simplified, then join jieba's segments with spaces.\n",
    "    cns.append(' '.join(jieba.lcut(Traditional2Simplified(chinese).strip(), cut_all=False)))\n",
    "print(len(cns))\n",
    "print(eng[10:20])\n",
    "print(cns[10:20])\n",
    "\n",
    "# Keep at most maxLen space-separated tokens per sentence.\n",
    "maxLen = 18\n",
    "eng = [' '.join(seq.split(' ')[:maxLen]) for seq in eng]\n",
    "cns = [' '.join(seq.split(' ')[:maxLen]) for seq in cns]\n",
    "\n",
    "# cns_a wraps each sentence in BOS/EOS markers; cns_b only appends EOS.\n",
    "cns_a = ['BOS ' + seq + ' EOS' for seq in cns]\n",
    "cns_b = [seq + ' EOS' for seq in cns]\n",
    "print(cns_a[10:20])\n",
    "print(cns_b[10:20])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "vocab_bag:  17751\n"
     ]
    }
   ],
   "source": [
    "import  pickle\n",
    "# Tally every space-separated token of the English and Chinese sentences,\n",
    "# followed by the BOS/EOS markers, in first-seen order.\n",
    "counts = {}\n",
    "BE = ['BOS', 'EOS']\n",
    "for sentence in eng + cns + BE:\n",
    "    for token in sentence.split(' '):\n",
    "        counts[token] = counts.get(token, 0) + 1\n",
    "\n",
    "# A token's index is its first-seen position; the two maps are inverses.\n",
    "word_to_index = {token: idx for idx, token in enumerate(counts)}\n",
    "index_to_word = {idx: token for idx, token in enumerate(counts)}\n",
    "vocab_bag = list(word_to_index)\n",
    "\n",
    "# Persist the three vocabulary objects as pickles.\n",
    "for pkl_name, obj in (('word_to_index.pkl', word_to_index),\n",
    "                      ('index_to_word.pkl', index_to_word),\n",
    "                      ('vocab_bag.pkl', vocab_bag)):\n",
    "    with open(pkl_name, 'wb') as f:\n",
    "        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)\n",
    "\n",
    "print('vocab_bag: ', len(vocab_bag))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[list([0, 1]) list([0, 1]) list([2, 1])]\n",
      "[list([17749, 7300, 7301, 17750]) list([17749, 7302, 7301, 17750])\n",
      " list([17749, 7303, 7304, 7305, 7306, 7301, 17750])]\n",
      "[list([7300, 7301, 17750]) list([7302, 7301, 17750])\n",
      " list([7303, 7304, 7305, 7306, 7301, 17750])]\n"
     ]
    }
   ],
   "source": [
    "def encode_sentences(sentences, vocab):\n",
    "    \"\"\"Map each space-separated sentence to a list of vocabulary indices.\n",
    "\n",
    "    Returns a 1-D object array holding one Python list per sentence. The array\n",
    "    is filled row by row because `np.array` on lists of unequal length raises\n",
    "    ValueError on recent NumPy unless `dtype=object` is requested.\n",
    "\n",
    "    Raises KeyError if a token is missing from `vocab`.\n",
    "    \"\"\"\n",
    "    encoded = np.empty(len(sentences), dtype=object)\n",
    "    for row, sentence in enumerate(sentences):\n",
    "        encoded[row] = [vocab[token] for token in sentence.split(' ')]\n",
    "    return encoded\n",
    "\n",
    "# NOTE: the string lists are overwritten by their encoded form, so this cell\n",
    "# cannot be re-run without re-running the cells that build eng/cns_a/cns_b.\n",
    "eng = encode_sentences(eng, word_to_index)\n",
    "cns_a = encode_sentences(cns_a, word_to_index)\n",
    "cns_b = encode_sentences(cns_b, word_to_index)\n",
    "print(eng[:3])\n",
    "print(cns_a[:3])\n",
    "print(cns_b[:3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "eng:  20403 \n",
      " cns:  20403\n",
      "Done!\n"
     ]
    }
   ],
   "source": [
    "import os \n",
    "import numpy as np\n",
    "# Report how many sentences each list holds.\n",
    "print('eng: ', len(eng), '\\n', 'cns: ', len(cns))\n",
    "# Write the first 20000 encoded sequences of each set to disk.\n",
    "for npy_name, encoded in (('eng.npy', eng), ('cns_a.npy', cns_a), ('cns_b.npy', cns_b)):\n",
    "    np.save(npy_name, encoded[:20000])\n",
    "print('Done!')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "cns_a.shape:  (20000,)\n",
      "word_to_vec_map:  17751\n",
      "vocab_size:  17752\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pickle\n",
    "import operator\n",
    "def shift_and_clip(sequences, limit):\n",
    "    \"\"\"Return a 1-D object array of lists with every index raised by 1.\n",
    "\n",
    "    Each list is cut to at most `limit` items. New lists are built, so the\n",
    "    sequences passed in are left unmodified.\n",
    "    \"\"\"\n",
    "    shifted = np.empty(len(sequences), dtype=object)\n",
    "    for row, seq in enumerate(sequences):\n",
    "        shifted[row] = [idx + 1 for idx in seq][:limit]\n",
    "    return shifted\n",
    "\n",
    "# The saved arrays hold Python lists (object dtype); NumPy >= 1.16.3 refuses\n",
    "# to load those unless allow_pickle=True is passed explicitly.\n",
    "eng = np.load('eng.npy', allow_pickle=True)\n",
    "cns_a = np.load('cns_a.npy', allow_pickle=True)\n",
    "cns_b = np.load('cns_b.npy', allow_pickle=True)\n",
    "print('cns_a.shape: ', cns_a.shape)\n",
    "\n",
    "# NOTE: pickle.load can execute arbitrary code; only load trusted files.\n",
    "with open('word_to_index.pkl', 'rb') as f:\n",
    "    word_to_index = pickle.load(f)\n",
    "# Raise every index by one so that 0 is not assigned to any word.\n",
    "word_to_index = {word: idx + 1 for word, idx in word_to_index.items()}\n",
    "index_to_word = {idx: word for word, idx in word_to_index.items()}\n",
    "\n",
    "# Apply the same +1 shift to the stored sequences and cap their length.\n",
    "maxLen = 20\n",
    "pad_eng = shift_and_clip(eng, maxLen)\n",
    "pad_cns_a = shift_and_clip(cns_a, maxLen)\n",
    "pad_cns_b = shift_and_clip(cns_b, maxLen)\n",
    "np.save('cns_o.npy', pad_cns_b)\n",
    "\n",
    "with open('vocab_bag.pkl', 'rb') as f:\n",
    "    words = pickle.load(f)\n",
    "# One more than the number of words, since indices now start at 1.\n",
    "vocab_size = len(word_to_index) + 1\n",
    "print('word_to_vec_map: ', len(list(words)))\n",
    "print('vocab_size: ', vocab_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[17750  7301  7302 17751     0     0     0     0     0     0     0     0\n",
      "      0     0     0     0     0     0     0     0]\n",
      " [17750  7303  7302 17751     0     0     0     0     0     0     0     0\n",
      "      0     0     0     0     0     0     0     0]\n",
      " [17750  7304  7305  7306  7307  7302 17751     0     0     0     0     0\n",
      "      0     0     0     0     0     0     0     0]]\n",
      "[list([7301, 7302, 17751]) list([7303, 7302, 17751])\n",
      " list([7304, 7305, 7306, 7307, 7302, 17751])]\n"
     ]
    }
   ],
   "source": [
    "from keras.preprocessing import sequence\n",
    "# Post-padding: pad and truncate at the end of each sequence, up to maxLen.\n",
    "pad_options = dict(maxlen=maxLen, dtype='int32', padding='post', truncating='post')\n",
    "pad_eng = sequence.pad_sequences(pad_eng, **pad_options)\n",
    "pad_cns = sequence.pad_sequences(pad_cns_a, **pad_options)\n",
    "\n",
    "def get_file_list(file_path):\n",
    "    \"\"\"List the entries of `file_path` ordered by modification time, oldest first.\n",
    "\n",
    "    Returns None (not an empty list) when the directory has no entries.\n",
    "    \"\"\"\n",
    "    entries = os.listdir(file_path)\n",
    "    if entries:\n",
    "        def modified_at(name):\n",
    "            return os.path.getmtime(os.path.join(file_path, name))\n",
    "        return sorted(entries, key=modified_at)\n",
    "    return None\n",
    "\n",
    "# Persist the vocabulary maps and the padded index matrices.\n",
    "for pkl_name, mapping in (('pad_word_to_index.pkl', word_to_index),\n",
    "                          ('pad_index_to_word.pkl', index_to_word)):\n",
    "    with open(pkl_name, 'wb') as f:\n",
    "        pickle.dump(mapping, f, pickle.HIGHEST_PROTOCOL)\n",
    "np.save('pad_eng.npy', pad_eng)\n",
    "np.save('pad_cns.npy', pad_cns)\n",
    "\n",
    "# Show the first three rows of pad_cns and of pad_cns_b.\n",
    "print(pad_cns[:3])\n",
    "print(pad_cns_b[:3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
